rm(list=ls()) # clean env
options(scipen=999) # seed randomness
library(tidyr)
library(ggplot2)
library(ggExtra)
library(MASS)
library(car)
library(nnet)
library(caret)
data <- read.table('winequality-red.csv', sep=",", header=T, stringsAsFactors=T)
head(data)
n <- nrow(data) #n#
p <- ncol(data) #p#
summary(data)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900 Min. :0.01200
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900 1st Qu.:0.07000
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200 Median :0.07900
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539 Mean :0.08747
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600 3rd Qu.:0.09000
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500 Max. :0.61100
free.sulfur.dioxide total.sulfur.dioxide density pH
Min. : 1.00 Min. : 6.00 Min. :0.9901 Min. :2.740
1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210
Median :14.00 Median : 38.00 Median :0.9968 Median :3.310
Mean :15.87 Mean : 46.47 Mean :0.9967 Mean :3.311
3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400
Max. :72.00 Max. :289.00 Max. :1.0037 Max. :4.010
sulphates alcohol quality
Min. :0.3300 Min. : 8.40 Min. :3.000
1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
Median :0.6200 Median :10.20 Median :6.000
Mean :0.6581 Mean :10.42 Mean :5.636
3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
Max. :2.0000 Max. :14.90 Max. :8.000
unique(data$fixed.acidity)
[1] 7.4 7.8 11.2 7.9 7.3 7.5 6.7 5.6 8.9 8.5 8.1 7.6 6.9 6.3 7.1 8.3
[17] 5.2 5.7 8.8 6.8 4.6 7.7 8.7 6.4 6.6 8.6 10.2 7.0 7.2 9.3 8.0 9.7
[33] 6.2 5.0 4.7 8.4 10.1 9.4 9.0 8.2 6.1 5.8 9.2 11.5 5.4 9.6 12.8 11.0
[49] 11.6 12.0 15.0 10.8 11.1 10.0 12.5 11.8 10.9 10.3 11.4 9.9 10.4 13.3 10.6 9.8
[65] 13.4 10.7 11.9 12.4 12.2 13.8 9.1 13.5 10.5 12.6 14.0 13.7 9.5 12.7 12.3 15.6
[81] 5.3 11.3 13.0 6.5 12.9 14.3 15.5 11.7 13.2 15.9 12.1 5.1 4.9 5.9 6.0 5.5
unique(data$voltile.acidity)
NULL
unique(data$citric.acid)
[1] 0.00 0.04 0.56 0.06 0.02 0.36 0.08 0.29 0.18 0.19 0.28 0.51 0.48 0.31 0.21 0.11
[17] 0.14 0.16 0.24 0.07 0.12 0.25 0.09 0.30 0.20 0.22 0.15 0.43 0.52 0.23 0.37 0.26
[33] 0.57 0.40 0.49 0.05 0.54 0.64 0.70 0.47 0.44 0.17 0.68 0.53 0.10 0.01 0.55 1.00
[49] 0.03 0.42 0.33 0.32 0.35 0.60 0.74 0.58 0.50 0.76 0.46 0.45 0.38 0.39 0.66 0.62
[65] 0.67 0.79 0.63 0.61 0.71 0.65 0.59 0.34 0.69 0.73 0.72 0.41 0.27 0.75 0.13 0.78
unique(data$residual.sugar)
[1] 1.90 2.60 2.30 1.80 1.60 1.20 2.00 6.10 3.80 3.90 1.70 4.40 2.40
[14] 1.40 2.50 10.70 5.50 2.10 1.50 5.90 2.80 2.20 3.00 3.40 5.10 4.65
[27] 1.30 7.30 7.20 2.90 2.70 5.60 3.10 3.20 3.30 3.60 4.00 7.00 6.40
[40] 3.50 11.00 3.65 4.50 4.80 2.95 5.80 6.20 4.20 7.90 3.70 6.70 6.60
[53] 2.15 5.20 2.55 15.50 4.10 8.30 6.55 4.60 4.30 5.15 6.30 6.00 8.60
[66] 7.50 2.25 4.25 2.85 3.45 2.35 2.65 9.00 8.80 5.00 1.65 2.05 0.90
[79] 8.90 8.10 4.70 1.75 7.80 12.90 13.40 5.40 15.40 3.75 13.80 5.70 13.90
unique(data$chlorides)
[1] 0.076 0.098 0.092 0.075 0.069 0.065 0.073 0.071 0.097 0.089 0.114 0.176 0.170
[14] 0.368 0.086 0.341 0.077 0.082 0.106 0.084 0.085 0.080 0.105 0.083 0.103 0.066
[27] 0.172 0.074 0.088 0.332 0.050 0.054 0.113 0.068 0.081 0.110 0.070 0.111 0.079
[40] 0.115 0.094 0.093 0.104 0.464 0.401 0.062 0.107 0.045 0.058 0.102 0.467 0.091
[53] 0.122 0.090 0.119 0.178 0.146 0.072 0.118 0.049 0.060 0.117 0.087 0.236 0.610
[66] 0.095 0.100 0.360 0.067 0.270 0.099 0.046 0.061 0.056 0.039 0.059 0.101 0.057
[79] 0.337 0.078 0.263 0.063 0.611 0.064 0.096 0.358 0.343 0.186 0.112 0.213 0.214
[92] 0.121 0.128 0.052 0.120 0.116 0.109 0.159 0.124 0.174 0.047 0.127 0.413 0.152
[105] 0.053 0.055 0.051 0.125 0.200 0.171 0.226 0.250 0.108 0.148 0.143 0.222 0.157
[118] 0.422 0.034 0.387 0.415 0.243 0.241 0.190 0.132 0.126 0.038 0.044 0.041 0.165
[131] 0.048 0.145 0.147 0.012 0.194 0.161 0.123 0.414 0.216 0.043 0.042 0.369 0.166
[144] 0.136 0.403 0.137 0.168 0.153 0.267 0.169 0.205 0.235 0.230
unique(data$free.sulfur.dioxide)
[1] 11.0 25.0 15.0 17.0 13.0 9.0 16.0 52.0 51.0 35.0 6.0 29.0 23.0 10.0 21.0 4.0
[17] 14.0 8.0 22.0 40.0 5.0 3.0 7.0 12.0 30.0 33.0 50.0 19.0 20.0 27.0 18.0 28.0
[33] 34.0 42.0 41.0 37.0 32.0 36.0 24.0 26.0 39.0 40.5 68.0 31.0 38.0 43.0 47.0 1.0
[49] 54.0 46.0 45.0 2.0 5.5 53.0 37.5 57.0 48.0 72.0 55.0 66.0
unique(data$total.sulfur.dioxide)
[1] 34.0 67.0 54.0 60.0 40.0 59.0 21.0 18.0 102.0 65.0 29.0 145.0 148.0
[14] 103.0 56.0 71.0 37.0 23.0 11.0 35.0 16.0 82.0 113.0 83.0 50.0 15.0
[27] 30.0 19.0 87.0 46.0 14.0 114.0 12.0 96.0 119.0 73.0 45.0 10.0 110.0
[40] 52.0 112.0 39.0 27.0 94.0 43.0 42.0 80.0 51.0 61.0 136.0 31.0 125.0
[53] 24.0 140.0 133.0 85.0 106.0 22.0 36.0 69.0 64.0 153.0 47.0 108.0 111.0
[66] 62.0 28.0 89.0 13.0 90.0 134.0 99.0 26.0 63.0 105.0 20.0 141.0 88.0
[79] 129.0 128.0 86.0 121.0 101.0 44.0 8.0 49.0 38.0 143.0 144.0 127.0 126.0
[92] 120.0 55.0 93.0 95.0 41.0 58.0 72.0 81.0 109.0 33.0 53.0 98.0 48.0
[105] 70.0 25.0 135.0 92.0 74.0 32.0 77.0 165.0 75.0 124.0 78.0 122.0 66.0
[118] 68.0 17.0 91.0 76.0 151.0 142.0 116.0 149.0 57.0 104.0 84.0 147.0 155.0
[131] 152.0 9.0 139.0 130.0 7.0 100.0 115.0 6.0 79.0 278.0 289.0 160.0 77.5
[144] 131.0
unique(data$density)
[1] 0.99780 0.99680 0.99700 0.99800 0.99640 0.99460 0.99590 0.99430 0.99740 0.99860
[11] 0.99690 0.99820 0.99660 0.99550 0.99620 0.99720 0.99580 0.99930 0.99570 0.99750
[21] 0.99400 0.99760 0.99340 0.99540 0.99710 0.99560 0.99830 0.99670 0.99610 0.99840
[31] 0.99380 0.99320 0.99650 0.99630 0.99600 0.99730 0.99880 0.99370 0.99520 0.99160
[41] 0.99440 0.99960 0.99500 0.99810 0.99530 0.99240 0.99480 0.99695 0.99545 0.99615
[51] 0.99940 0.99625 0.99585 0.99685 0.99655 0.99525 0.99815 0.99745 0.99270 0.99675
[61] 0.99925 0.99565 1.00005 0.99850 0.99965 0.99575 0.99990 1.00025 0.99870 0.99935
[71] 0.99735 0.99915 0.99910 1.00015 0.99970 1.00100 0.99790 1.00140 1.00010 0.99855
[81] 0.99845 0.99980 0.99645 0.99865 0.99890 0.99975 0.99900 1.00150 1.00020 0.99920
[91] 1.00080 1.00000 1.00060 1.00040 1.00180 0.99120 1.00220 1.00030 0.99490 0.99510
[101] 1.00320 0.99470 0.99950 0.99770 1.00260 1.00315 1.00210 0.99170 0.99220 0.99210
[111] 0.99788 1.00024 0.99768 0.99782 0.99761 0.99803 0.99785 0.99656 0.99488 0.99823
[121] 0.99779 0.99738 0.99701 0.99888 0.99938 0.99744 0.99668 0.99727 0.99586 0.99612
[131] 0.99676 0.99732 0.99814 0.99746 0.99708 0.99818 0.99639 0.99531 0.99786 0.99526
[141] 0.99641 0.99264 0.99682 0.99356 0.99386 0.99702 0.99693 0.99562 1.00012 0.99462
[151] 0.99939 0.99632 0.99976 0.99606 0.99154 0.99624 0.99417 0.99376 0.99832 0.99836
[161] 0.99694 0.99064 0.99672 0.99647 0.99736 0.99629 0.99689 0.99801 0.99652 0.99538
[171] 0.99594 0.99686 0.99438 0.99357 0.99628 0.99748 0.99578 0.99371 0.99522 0.99576
[181] 0.99552 0.99664 0.99614 0.99517 0.99787 0.99533 0.99536 0.99824 0.99577 0.99491
[191] 1.00289 0.99743 0.99774 0.99444 0.99892 0.99528 0.99331 0.99901 0.99674 0.99512
[201] 0.99395 0.99504 0.99516 0.99604 0.99468 0.99543 0.99791 0.99425 0.99509 0.99484
[211] 0.99834 0.99864 0.99498 0.99566 0.99408 0.99458 0.99648 0.99568 0.99613 0.99519
[221] 0.99518 0.99592 0.99654 0.99546 0.99554 0.99733 0.99669 0.99724 0.99643 0.99605
[231] 0.99658 0.99416 0.99712 0.99418 0.99596 0.99556 0.99918 0.99697 0.99378 0.99162
[241] 0.99495 0.99280 0.99603 0.99549 0.99722 0.99354 0.99635 0.99454 0.99598 0.99486
[251] 0.99007 0.99636 0.99642 0.99584 0.99506 0.99822 0.99364 0.99514 0.99854 0.99739
[261] 0.99683 0.99692 0.99756 0.99547 0.99859 0.99294 0.99634 0.99704 0.99258 0.99426
[271] 0.99747 0.99784 0.99358 0.99572 0.99769 0.99534 0.99817 0.99316 0.99471 0.99617
[281] 0.99529 0.99451 0.99479 0.99772 0.99666 0.99392 0.99388 0.99402 0.99360 0.99374
[291] 0.99523 0.99593 0.99396 0.99698 0.99020 0.99252 0.99256 0.99235 0.99352 0.99557
[301] 0.99394 0.99150 0.99379 0.99798 0.99341 0.99330 0.99684 0.99524 0.99764 0.99588
[311] 0.99473 0.99616 0.99622 0.99544 0.99728 0.99551 0.99434 0.99709 0.99384 0.99502
[321] 0.99667 0.99649 0.99716 0.99541 0.99318 0.99346 0.99599 0.99478 0.99754 0.99439
[331] 0.99633 0.99419 0.99878 0.99752 0.99428 0.99659 0.99677 0.99734 0.99678 0.99638
[341] 0.99922 0.99157 0.99718 0.99621 0.99242 0.99494 0.99729 0.99414 0.99721 0.99627
[351] 0.99569 0.99499 0.99437 0.99726 0.99456 0.99564 0.99080 0.99084 0.99350 0.99385
[361] 0.99688 0.99619 0.99476 0.99328 0.99286 0.99914 0.99521 0.99362 0.99558 0.99323
[371] 0.99191 0.99501 0.99290 0.99532 0.99796 0.99581 0.99608 0.99387 0.99448 0.99589
[381] 0.99852 0.99472 0.99587 0.99332 0.99464 0.99699 0.99725 0.99623 0.99609 0.99292
[391] 0.99420 1.00369 0.99713 0.99322 0.99706 0.99974 0.99467 0.99236 0.99705 0.99334
[401] 0.99336 1.00242 0.99182 0.99808 0.99828 0.99719 0.99542 0.99496 0.99344 0.99348
[411] 0.99459 0.99492 0.99508 0.99582 0.99555 0.99410 0.99661 0.99842 0.99489 0.99665
[421] 0.99553 0.99714 0.99631 0.99573 0.99717 0.99397 0.99646 0.99758 0.99306 0.99783
[431] 0.99765 0.99474 0.99483 0.99314 0.99574 0.99651
unique(data$pH)
[1] 3.51 3.20 3.26 3.16 3.30 3.39 3.36 3.35 3.28 3.58 3.17 3.11 3.38 3.04 3.52 3.43
[17] 3.34 3.47 3.46 3.45 3.40 3.42 3.23 3.50 3.33 3.21 3.48 3.90 3.25 3.32 3.15 3.41
[33] 3.44 3.31 3.54 3.13 2.93 3.14 3.75 3.85 3.29 3.08 3.37 3.19 3.07 3.49 3.53 3.24
[49] 3.63 3.22 3.68 2.74 3.59 3.00 3.12 3.57 3.61 3.06 3.60 3.69 3.10 3.05 3.67 3.27
[65] 3.18 3.02 3.55 2.99 3.01 3.56 3.03 3.62 2.88 2.95 2.98 3.09 2.86 3.74 2.92 3.72
[81] 2.87 2.89 2.94 3.66 3.71 3.78 3.70 4.01 2.90
unique(data$sulphates)
[1] 0.56 0.68 0.65 0.58 0.46 0.47 0.57 0.80 0.54 0.52 1.56 0.88 0.93 0.75 1.28 0.50
[17] 1.08 0.53 0.91 0.63 0.59 0.55 0.66 0.60 0.73 0.48 0.83 0.51 0.90 1.20 0.74 0.64
[33] 0.77 0.71 0.62 0.39 0.79 0.95 0.82 1.12 1.14 0.78 1.95 1.22 1.98 0.61 1.31 0.69
[49] 0.67 0.70 0.49 0.92 2.00 0.72 1.59 0.33 1.02 0.97 0.85 0.43 1.03 0.86 0.76 1.61
[65] 1.09 0.84 0.96 0.45 1.26 0.87 0.81 1.00 1.36 1.18 0.89 0.98 1.13 1.04 1.11 0.99
[81] 1.07 0.44 1.06 1.05 0.42 1.17 1.62 0.94 1.34 1.16 1.10 0.40 1.15 0.37 1.33 1.01
unique(data$alcohol)
[1] 9.400000 9.800000 10.000000 9.500000 10.500000 9.200000 9.900000 9.100000
[9] 9.300000 9.000000 9.700000 10.100000 10.600000 9.600000 10.800000 10.300000
[17] 13.100000 10.200000 10.900000 10.700000 12.900000 10.400000 13.000000 14.000000
[25] 11.500000 11.400000 12.400000 11.000000 12.200000 12.800000 12.600000 12.500000
[33] 11.700000 11.300000 12.300000 12.000000 11.900000 11.800000 8.700000 13.300000
[41] 11.200000 11.600000 11.100000 13.400000 12.100000 8.400000 12.700000 14.900000
[49] 13.200000 13.600000 13.500000 10.033333 9.550000 8.500000 11.066667 9.566667
[57] 10.550000 8.800000 13.566667 11.950000 9.950000 9.233333 9.250000 9.050000
[65] 10.750000
unique(data$quality)
[1] 5 6 7 4 8 3
data$label <- with(data, ifelse(quality >= 7, 'great',
ifelse(quality >= 5, 'good', 'poor')))
data$y <- with(data, ifelse(quality >= 7, 1, 0))
df <- data[,1:11]
cat <- data[,13]
lab <- data[,14]
pairs(df)
colMeans(data[,1:12])
fixed.acidity volatile.acidity citric.acid residual.sugar
8.31963727 0.52782051 0.27097561 2.53880550
chlorides free.sulfur.dioxide total.sulfur.dioxide density
0.08746654 15.87492183 46.46779237 0.99674668
pH sulphates alcohol quality
3.31111320 0.65814884 10.42298311 5.63602251
mvec <- colMeans(df) # sample mean vector
covM <- cov(df) # sample covariance matrix
corM <- cor(df) # sample correlation matrix
det(cov(df)) # generalized sample variance
[1] 0.00000000003478418
sum(diag(cov(df))) # total sample variance
[1] 1197.797
FindcrikChi <- function(n, p, alpha=0.5, N=1000){
cricvec <- rep(0, N) #vector for the rQ result collection#
for(i in 1:N){
#iteration to estimate rQ#
numvec <- rchisq(n, p) #generate a data set of size n, degree of freedom=p#
d <- sort(numvec)
q <- qchisq((1:n-0.5)/n, p)
cricvec[i] <- cor(d,q)
}
scricvec <- sort(cricvec)
cN <- ceiling(N* alpha) #to be on the safe side I use ceiling instead of floor(), take the 'worst' alpha*N cor as rQ, everything lower than that is deemed as rejection#
cricvalue <- scricvec[cN]
result <- list(cN, cricvalue, scricvec)
return(result)
}
critic <- FindcrikChi(n, p-1)
critic[[2]]
[1] 0.9993465
DensityPlots <- function(data_set){
for (col in names(data_set)){
print(mean(data_set[[col]]))
qqc <- qqnorm(data_set[[col]], main = paste("QQ - Plot: ", col))
corqq <- cor(qqc$x, qqc$y)
if (round(corqq,2) >= round(critic[[2]],3)){
qqline(data_set[[col]], col='blue', lwd=2)
print(paste('Data ', col, ' is Normally Distributed! with: ', round(corqq,3)))
} else {
qqline(data_set[[col]], col='orange', lwd=2)
print(paste('Data ', col, ' is NOT Normally Distributed! with: ', round(corqq,3)))
}
for ( i in 1:ncol(data_set)){
if (col != names(data_set[i])){
j <- names(data_set[i])
df_mean <- as.data.frame(colMeans(data_set[c(col, j)]))
plot <- ggplot(data = data_set) +
geom_point(mapping = aes(x = .data[[col]], y = .data[[j]])) +
geom_point(data=t(df_mean), mapping=aes(x = .data[[col]], y = .data[[j]]), col="red")
print(ggMarginal(plot, type="densigram"))
# or standard R
# plot(data_set[[col]], data_set[,i], col='blue', lwd=2, xlab=col, ylab=j)
# points(mean(data_set[[col]]), mean(data_set[,i]), col='red', lwd=8)
print(paste(col, ' vs ', names(data_set[i]), ': ',
cov(data_set[[col]], data_set[,i])))
}
}
}
}
DensityPlots(df)
[1] 8.319637
[1] "Data fixed.acidity is NOT Normally Distributed! with: 0.971"
[1] "fixed.acidity vs volatile.acidity : -0.0798514168351465"
[1] "fixed.acidity vs citric.acid : 0.227820003663115"
[1] "fixed.acidity vs residual.sugar : 0.281756262322901"
[1] "fixed.acidity vs chlorides : 0.0076786924869345"
[1] "fixed.acidity vs free.sulfur.dioxide : -2.8009214927039"
[1] "fixed.acidity vs total.sulfur.dioxide : -6.48234585758778"
[1] "fixed.acidity vs density : 0.00219522357567034"
[1] "fixed.acidity vs pH : -0.183585703596037"
[1] "fixed.acidity vs sulphates : 0.0540100915700598"
[1] "fixed.acidity vs alcohol : -0.114421153396092"
[1] 0.5278205
[1] "Data volatile.acidity is NOT Normally Distributed! with: 0.987"
[1] "volatile.acidity vs fixed.acidity : -0.0798514168351465"
[1] "volatile.acidity vs citric.acid : -0.01927162077597"
[1] "volatile.acidity vs residual.sugar : 0.000484190975899359"
[1] "volatile.acidity vs chlorides : 0.00051658691954687"
[1] "volatile.acidity vs free.sulfur.dioxide : -0.0196735903854177"
[1] "volatile.acidity vs total.sulfur.dioxide : 0.450425692371875"
[1] "volatile.acidity vs density : 0.00000744366515837123"
[1] "volatile.acidity vs pH : 0.0064946993036167"
[1] "volatile.acidity vs sulphates : -0.00792143384358653"
[1] "volatile.acidity vs alcohol : -0.0386002214306344"
[1] 0.2709756
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs fixed.acidity : 0.227820003663115"
[1] "citric.acid vs volatile.acidity : -0.01927162077597"
[1] "citric.acid vs residual.sugar : 0.0394342699716109"
[1] "citric.acid vs chlorides : 0.00186872477792362"
[1] "citric.acid vs free.sulfur.dioxide : -0.124252113922891"
[1] "citric.acid vs total.sulfur.dioxide : 0.227697274031564"
[1] "citric.acid vs density : 0.000134174581031167"
[1] "citric.acid vs pH : -0.0162975823437834"
[1] "citric.acid vs sulphates : 0.0103277145212003"
[1] "citric.acid vs alcohol : 0.0228151729295766"
[1] 2.538806
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs fixed.acidity : 0.281756262322901"
[1] "residual.sugar vs volatile.acidity : 0.000484190975899359"
[1] "residual.sugar vs citric.acid : 0.0394342699716109"
[1] "residual.sugar vs chlorides : 0.00369017590390114"
[1] "residual.sugar vs free.sulfur.dioxide : 2.75861145224526"
[1] "residual.sugar vs total.sulfur.dioxide : 9.4164414789907"
[1] "residual.sugar vs density : 0.000945410861841846"
[1] "residual.sugar vs pH : -0.0186442889838064"
[1] "residual.sugar vs sulphates : 0.00132094135806093"
[1] "residual.sugar vs alcohol : 0.0632189597926113"
[1] 0.08746654
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs fixed.acidity : 0.0076786924869345"
[1] "chlorides vs volatile.acidity : 0.00051658691954687"
[1] "chlorides vs citric.acid : 0.00186872477792362"
[1] "chlorides vs residual.sugar : 0.00369017590390114"
[1] "chlorides vs free.sulfur.dioxide : 0.00273830307740836"
[1] "chlorides vs total.sulfur.dioxide : 0.0733867502451861"
[1] "chlorides vs density : 0.000017821756780873"
[1] "chlorides vs pH : -0.00192574495871559"
[1] "chlorides vs sulphates : 0.00296187794937543"
[1] "chlorides vs alcohol : -0.0110915177743286"
[1] 15.87492
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs fixed.acidity : -2.8009214927039"
[1] "free.sulfur.dioxide vs volatile.acidity : -0.0196735903854177"
[1] "free.sulfur.dioxide vs citric.acid : -0.124252113922891"
[1] "free.sulfur.dioxide vs residual.sugar : 2.75861145224526"
[1] "free.sulfur.dioxide vs chlorides : 0.00273830307740836"
[1] "free.sulfur.dioxide vs total.sulfur.dioxide : 229.737520947463"
[1] "free.sulfur.dioxide vs density : -0.000433250416209755"
[1] "free.sulfur.dioxide vs pH : 0.113653090831958"
[1] "free.sulfur.dioxide vs sulphates : 0.0915924709670703"
[1] "free.sulfur.dioxide vs alcohol : -0.773698400361301"
[1] 46.46779
[1] "Data total.sulfur.dioxide is NOT Normally Distributed! with: 0.934"
[1] "total.sulfur.dioxide vs fixed.acidity : -6.48234585758778"
[1] "total.sulfur.dioxide vs volatile.acidity : 0.450425692371875"
[1] "total.sulfur.dioxide vs citric.acid : 0.227697274031564"
[1] "total.sulfur.dioxide vs residual.sugar : 9.4164414789907"
[1] "total.sulfur.dioxide vs chlorides : 0.0733867502451861"
[1] "total.sulfur.dioxide vs free.sulfur.dioxide : 229.737520947463"
[1] "total.sulfur.dioxide vs density : 0.00442472714485978"
[1] "total.sulfur.dioxide vs pH : -0.337698792502511"
[1] "total.sulfur.dioxide vs sulphates : 0.239471004640729"
[1] "total.sulfur.dioxide vs alcohol : -7.20929789503922"
[1] 0.9967467
[1] "Data density is Normally Distributed! with: 0.995"
[1] "density vs fixed.acidity : 0.00219522357567034"
[1] "density vs volatile.acidity : 0.00000744366515837123"
[1] "density vs citric.acid : 0.000134174581031167"
[1] "density vs residual.sugar : 0.000945410861841846"
[1] "density vs chlorides : 0.000017821756780873"
[1] "density vs free.sulfur.dioxide : -0.000433250416209755"
[1] "density vs total.sulfur.dioxide : 0.00442472714485978"
[1] "density vs pH : -0.0000995639480166344"
[1] "density vs sulphates : 0.0000475096184959153"
[1] "density vs alcohol : -0.000997951789525837"
[1] 3.311113
[1] "Data pH is Normally Distributed! with: 0.997"
[1] "pH vs fixed.acidity : -0.183585703596037"
[1] "pH vs volatile.acidity : 0.0064946993036167"
[1] "pH vs citric.acid : -0.0162975823437834"
[1] "pH vs residual.sugar : -0.0186442889838064"
[1] "pH vs chlorides : -0.00192574495871559"
[1] "pH vs free.sulfur.dioxide : 0.113653090831958"
[1] "pH vs total.sulfur.dioxide : -0.337698792502511"
[1] "pH vs density : -0.0000995639480166344"
[1] "pH vs sulphates : -0.0051461858201426"
[1] "pH vs alcohol : 0.0338316166393107"
[1] 0.6581488
[1] "Data sulphates is NOT Normally Distributed! with: 0.912"
[1] "sulphates vs fixed.acidity : 0.0540100915700598"
[1] "sulphates vs volatile.acidity : -0.00792143384358653"
[1] "sulphates vs citric.acid : 0.0103277145212003"
[1] "sulphates vs residual.sugar : 0.00132094135806093"
[1] "sulphates vs chlorides : 0.00296187794937543"
[1] "sulphates vs free.sulfur.dioxide : 0.0915924709670703"
[1] "sulphates vs total.sulfur.dioxide : 0.239471004640729"
[1] "sulphates vs density : 0.0000475096184959153"
[1] "sulphates vs pH : -0.0051461858201426"
[1] "sulphates vs alcohol : 0.0169067772332677"
[1] 10.42298
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs fixed.acidity : -0.114421153396092"
[1] "alcohol vs volatile.acidity : -0.0386002214306344"
[1] "alcohol vs citric.acid : 0.0228151729295766"
[1] "alcohol vs residual.sugar : 0.0632189597926113"
[1] "alcohol vs chlorides : -0.0110915177743286"
[1] "alcohol vs free.sulfur.dioxide : -0.773698400361301"
[1] "alcohol vs total.sulfur.dioxide : -7.20929789503922"
[1] "alcohol vs density : -0.000997951789525837"
[1] "alcohol vs pH : 0.0338316166393107"
[1] "alcohol vs sulphates : 0.0169067772332677"
df_normal <- data.frame(matrix(nrow=n, ncol = 11))
colnames(df_normal) <- names(df)
normal <- c()
not_normal <- c()
for (col in names(df)){
tryCatch(
{
boxcoxTransc <- boxcox(df[[col]] ~ 1,lambda=seq(-2.5, 2.5,.01))
title(col)
flagidx <- which(boxcoxTransc$y==max(boxcoxTransc$y))
optlam <- boxcoxTransc$x[flagidx]
vec <- df[[col]]
transvec <- (vec^optlam-1)/optlam #according to (4-34)#
# transformed data#
qqts <- qqnorm(transvec, main = paste("QQ - Plot: ", col))
cortrans <- cor(qqts$x, qqts$y)
},
error = function(cond) {
message(paste("Data NOT transformed: ", col))
message("Here's the original error message:")
message(conditionMessage(cond))
# Choose a return value in case of error
qqts <- qqnorm(df[[col]], main = paste("QQ - Plot: ", col))
qqline(df[[col]], col='orange', lwd=2)
cortrans <- cor(qqts$x, qqts$y)
return(cortrans)
},
finally = {
if (round(cortrans, 2) >= round(critic[[2]], 3)){
normal <- append(normal, col)
qqline(transvec, col='blue', lwd=2)
print(paste('Data ', col, ' is Normally Distributed! with: ', round(cortrans,3)))
df_normal[[col]] <- transvec
} else {
not_normal <- append(not_normal, col)
qqline(transvec, col='orange', lwd=2)
print(paste('Data ', col, ' is NOT Normally Distributed! with: ',
round(cortrans,3)))
}
}
)
}
[1] "Data fixed.acidity is Normally Distributed! with: 0.997"
[1] "Data volatile.acidity is Normally Distributed! with: 0.998"
Data NOT transformed: citric.acid
Here's the original error message:
response variable must be positive
[1] "Data citric.acid is Normally Distributed! with: 0.998"
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.988"
[1] "Data chlorides is NOT Normally Distributed! with: 0.933"
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.994"
[1] "Data total.sulfur.dioxide is Normally Distributed! with: 0.995"
[1] "Data density is Normally Distributed! with: 0.996"
[1] "Data pH is Normally Distributed! with: 0.998"
[1] "Data sulphates is Normally Distributed! with: 0.998"
[1] "Data alcohol is NOT Normally Distributed! with: 0.985"
unique(df$citric.acid)
[1] 0.00 0.04 0.56 0.06 0.02 0.36 0.08 0.29 0.18 0.19 0.28 0.51 0.48 0.31 0.21 0.11
[17] 0.14 0.16 0.24 0.07 0.12 0.25 0.09 0.30 0.20 0.22 0.15 0.43 0.52 0.23 0.37 0.26
[33] 0.57 0.40 0.49 0.05 0.54 0.64 0.70 0.47 0.44 0.17 0.68 0.53 0.10 0.01 0.55 1.00
[49] 0.03 0.42 0.33 0.32 0.35 0.60 0.74 0.58 0.50 0.76 0.46 0.45 0.38 0.39 0.66 0.62
[65] 0.67 0.79 0.63 0.61 0.71 0.65 0.59 0.34 0.69 0.73 0.72 0.41 0.27 0.75 0.13 0.78
df_norm <- df_normal[normal]
df_norm <- within(df_norm, rm('citric.acid'))
pairs(df_norm)
DensityPlots(df_norm)
[1] 1.123393
[1] "Data fixed.acidity is Normally Distributed! with: 0.997"
[1] "fixed.acidity vs volatile.acidity : -0.00325578336388052"
[1] "fixed.acidity vs total.sulfur.dioxide : -0.00445079118867292"
[1] "fixed.acidity vs density : 0.0000620357104893582"
[1] "fixed.acidity vs pH : -0.00154673826556414"
[1] "fixed.acidity vs sulphates : 0.00316375032430066"
[1] -0.5907973
[1] "Data volatile.acidity is Normally Distributed! with: 0.998"
[1] "volatile.acidity vs fixed.acidity : -0.00325578336388052"
[1] "volatile.acidity vs total.sulfur.dioxide : 0.0201015425654002"
[1] "volatile.acidity vs density : 0.0000183855215118722"
[1] "volatile.acidity vs pH : 0.00269943316805013"
[1] "volatile.acidity vs sulphates : -0.0282823793958877"
[1] 3.960085
[1] "Data total.sulfur.dioxide is Normally Distributed! with: 0.995"
[1] "total.sulfur.dioxide vs fixed.acidity : -0.00445079118867292"
[1] "total.sulfur.dioxide vs volatile.acidity : 0.0201015425654002"
[1] "total.sulfur.dioxide vs density : 0.00016722798065971"
[1] "total.sulfur.dioxide vs pH : -0.000706287076722766"
[1] "total.sulfur.dioxide vs sulphates : 0.011014698090896"
[1] -0.003278254
[1] "Data density is Normally Distributed! with: 0.996"
[1] "density vs fixed.acidity : 0.0000620357104893582"
[1] "density vs volatile.acidity : 0.0000183855215118722"
[1] "density vs total.sulfur.dioxide : 0.00016722798065971"
[1] "density vs pH : -0.0000292094469761482"
[1] "density vs sulphates : 0.000115258678596354"
[1] 1.17496
[1] "Data pH is Normally Distributed! with: 0.998"
[1] "pH vs fixed.acidity : -0.00154673826556414"
[1] "pH vs volatile.acidity : 0.00269943316805013"
[1] "pH vs total.sulfur.dioxide : -0.000706287076722766"
[1] "pH vs density : -0.0000292094469761482"
[1] "pH vs sulphates : -0.0017818484264631"
[1] -0.6092693
[1] "Data sulphates is Normally Distributed! with: 0.998"
[1] "sulphates vs fixed.acidity : 0.00316375032430066"
[1] "sulphates vs volatile.acidity : -0.0282823793958877"
[1] "sulphates vs total.sulfur.dioxide : 0.011014698090896"
[1] "sulphates vs density : 0.000115258678596354"
[1] "sulphates vs pH : -0.0017818484264631"
df_to_scale <- df[not_normal]
df_to_scale$citric.acid <- df$citric.acid
scale_data <- as.data.frame(scale(df_to_scale))
pairs(scale_data)
DensityPlots(scale_data)
[1] -0.0000000000000001156003
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs chlorides : 0.0556095352035322"
[1] "residual.sugar vs free.sulfur.dioxide : 0.187048995104287"
[1] "residual.sugar vs alcohol : 0.0420754372097311"
[1] "residual.sugar vs citric.acid : 0.143577161570314"
[1] 0.00000000000000008613634
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs residual.sugar : 0.0556095352035322"
[1] "chlorides vs free.sulfur.dioxide : 0.00556214700478112"
[1] "chlorides vs alcohol : -0.221140544788283"
[1] "chlorides vs citric.acid : 0.203822913829042"
[1] -0.00000000000000005600528
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs residual.sugar : 0.187048995104287"
[1] "free.sulfur.dioxide vs chlorides : 0.00556214700478112"
[1] "free.sulfur.dioxide vs alcohol : -0.0694083535649999"
[1] "free.sulfur.dioxide vs citric.acid : -0.0609781291923049"
[1] 0.00000000000000008786086
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs residual.sugar : 0.0420754372097311"
[1] "alcohol vs chlorides : -0.221140544788283"
[1] "alcohol vs free.sulfur.dioxide : -0.0694083535649999"
[1] "alcohol vs citric.acid : 0.109903246641567"
[1] -0.00000000000000009207575
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs residual.sugar : 0.143577161570314"
[1] "citric.acid vs chlorides : 0.203822913829042"
[1] "citric.acid vs free.sulfur.dioxide : -0.0609781291923049"
[1] "citric.acid vs alcohol : 0.109903246641567"
log_scale <- log(df_to_scale)
pairs(log_scale)
DensityPlots(log_scale)
[1] 0.8502318
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.925"
[1] "residual.sugar vs chlorides : 0.0136963986844303"
[1] "residual.sugar vs free.sulfur.dioxide : 0.0229861717575993"
[1] "residual.sugar vs alcohol : 0.00281540290773283"
[1] "residual.sugar vs citric.acid : NaN"
[1] -2.505462
[1] "Data chlorides is NOT Normally Distributed! with: 0.91"
[1] "chlorides vs residual.sugar : 0.0136963986844303"
[1] "chlorides vs free.sulfur.dioxide : -0.00304888609384212"
[1] "chlorides vs alcohol : -0.00989314903575179"
[1] "chlorides vs citric.acid : NaN"
[1] 2.546132
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.992"
[1] "free.sulfur.dioxide vs residual.sugar : 0.0229861717575993"
[1] "free.sulfur.dioxide vs chlorides : -0.00304888609384212"
[1] "free.sulfur.dioxide vs alcohol : -0.00570807621737924"
[1] "free.sulfur.dioxide vs citric.acid : NaN"
[1] 2.339021
[1] "Data alcohol is NOT Normally Distributed! with: 0.973"
[1] "alcohol vs residual.sugar : 0.00281540290773283"
[1] "alcohol vs chlorides : -0.00989314903575179"
[1] "alcohol vs free.sulfur.dioxide : -0.00570807621737924"
[1] "alcohol vs citric.acid : NaN"
[1] -Inf
Error in plot.window(...) : need finite 'ylim' values
process <- preProcess(df_to_scale, method=c("range"))
norm_scale <- predict(process, df_to_scale)
pairs(norm_scale)
DensityPlots(norm_scale)
[1] 0.112247
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs chlorides : 0.000421956217428721"
[1] "residual.sugar vs free.sulfur.dioxide : 0.00266121112506778"
[1] "residual.sugar vs alcohol : 0.000666163959879993"
[1] "residual.sugar vs citric.acid : 0.00270097739531581"
[1] 0.1259875
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs residual.sugar : 0.000421956217428721"
[1] "chlorides vs free.sulfur.dioxide : 0.0000643867261729257"
[1] "chlorides vs alcohol : -0.00284872679448532"
[1] "chlorides vs citric.acid : 0.00311974086464712"
[1] 0.2095059
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs residual.sugar : 0.00266121112506778"
[1] "free.sulfur.dioxide vs chlorides : 0.0000643867261729257"
[1] "free.sulfur.dioxide vs alcohol : -0.00167648624130293"
[1] "free.sulfur.dioxide vs citric.acid : -0.00175002977356185"
[1] 0.3112282
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs residual.sugar : 0.000666163959879993"
[1] "alcohol vs chlorides : -0.00284872679448532"
[1] "alcohol vs free.sulfur.dioxide : -0.00167648624130293"
[1] "alcohol vs citric.acid : 0.00351002660455024"
[1] 0.2709756
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs residual.sugar : 0.00270097739531581"
[1] "citric.acid vs chlorides : 0.00311974086464712"
[1] "citric.acid vs free.sulfur.dioxide : -0.00175002977356185"
[1] "citric.acid vs alcohol : 0.00351002660455024"
df_stndardized <- df_to_scale
for (col in names(df_to_scale)){
df_stndardized[[col]] <- (df_to_scale[[col]] - mean(df_to_scale[[col]])) / sd(df_to_scale[[col]])
}
pairs(df_stndardized)
DensityPlots(df_stndardized)
[1] -0.0000000000000001156003
[1] "Data residual.sugar is NOT Normally Distributed! with: 0.752"
[1] "residual.sugar vs chlorides : 0.0556095352035322"
[1] "residual.sugar vs free.sulfur.dioxide : 0.187048995104287"
[1] "residual.sugar vs alcohol : 0.0420754372097311"
[1] "residual.sugar vs citric.acid : 0.143577161570314"
[1] 0.00000000000000008888973
[1] "Data chlorides is NOT Normally Distributed! with: 0.695"
[1] "chlorides vs residual.sugar : 0.0556095352035322"
[1] "chlorides vs free.sulfur.dioxide : 0.00556214700478112"
[1] "chlorides vs alcohol : -0.221140544788283"
[1] "chlorides vs citric.acid : 0.203822913829042"
[1] -0.00000000000000005600528
[1] "Data free.sulfur.dioxide is NOT Normally Distributed! with: 0.95"
[1] "free.sulfur.dioxide vs residual.sugar : 0.187048995104287"
[1] "free.sulfur.dioxide vs chlorides : 0.00556214700478112"
[1] "free.sulfur.dioxide vs alcohol : -0.0694083535649999"
[1] "free.sulfur.dioxide vs citric.acid : -0.0609781291923049"
[1] 0.00000000000000008080805
[1] "Data alcohol is NOT Normally Distributed! with: 0.964"
[1] "alcohol vs residual.sugar : 0.0420754372097311"
[1] "alcohol vs chlorides : -0.221140544788283"
[1] "alcohol vs free.sulfur.dioxide : -0.0694083535649999"
[1] "alcohol vs citric.acid : 0.109903246641567"
[1] -0.00000000000000009207575
[1] "Data citric.acid is NOT Normally Distributed! with: 0.977"
[1] "citric.acid vs residual.sugar : 0.143577161570314"
[1] "citric.acid vs chlorides : 0.203822913829042"
[1] "citric.acid vs free.sulfur.dioxide : -0.0609781291923049"
[1] "citric.acid vs alcohol : 0.109903246641567"
chiforbi <- FindcrikChi(n, 2)
BivariateQQ <- function(data_set){
for (col in names(data_set)){
for ( i in 1:ncol(data_set)){
if (col != names(data_set[i])){
dat <- c(col, names(data_set[i]))
X <- (data_set[dat])
mu <- colMeans(data_set[dat])
S <- cov(data_set[dat])
result <- c()
tryCatch(
{
for (row in 1:nrow(X)){
v <- as.matrix(X[row,])
result[row] <- mahalanobis(v, mu, S)
#result[row] <- (v-mu)%*%solve(S)%*%t(v-mu)
y <- sort(result)
# The second parameter is now '2'
# because we have only two variables (bivariate)
x <- qchisq(1:length(result)/(length(result)+1), 2)
}
plot(x, y)
if (round(cor(x, y),2) >= round(chiforbi[[2]],3)){
abline(0,1, col='blue', lwd=2)
print(paste(col, ' vs ', names(data_set[i]),
' is Normally Distributed! with: ', round(cor(x, y),3)))
} else {
abline(0,1, col='orange', lwd=2)
print(paste(col, ' vs ', names(data_set[i]),
' is NOT Normally Distributed! with: ', round(cor(x, y),3)))
}
},
error = function(cond) {
message(paste("Data NOT calculated: ", col, names(data_set[i])))
message("Here's the original error message:")
message(conditionMessage(cond))
# Choose a return value in case of error
NA
})
}
}
}
}
round(chiforbi[[2]],3)
[1] 0.999
BivariateQQ(df_norm)
[1] "fixed.acidity vs volatile.acidity is Normally Distributed! with: 0.996"
[1] "fixed.acidity vs total.sulfur.dioxide is Normally Distributed! with: 0.996"
[1] "fixed.acidity vs density is NOT Normally Distributed! with: 0.989"
[1] "fixed.acidity vs pH is NOT Normally Distributed! with: 0.985"
[1] "fixed.acidity vs sulphates is Normally Distributed! with: 0.996"
[1] "volatile.acidity vs fixed.acidity is Normally Distributed! with: 0.996"
[1] "volatile.acidity vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.99"
[1] "volatile.acidity vs density is NOT Normally Distributed! with: 0.993"
[1] "volatile.acidity vs pH is NOT Normally Distributed! with: 0.991"
[1] "volatile.acidity vs sulphates is NOT Normally Distributed! with: 0.992"
[1] "total.sulfur.dioxide vs fixed.acidity is Normally Distributed! with: 0.996"
[1] "total.sulfur.dioxide vs volatile.acidity is NOT Normally Distributed! with: 0.99"
[1] "total.sulfur.dioxide vs density is NOT Normally Distributed! with: 0.988"
[1] "total.sulfur.dioxide vs pH is NOT Normally Distributed! with: 0.992"
[1] "total.sulfur.dioxide vs sulphates is NOT Normally Distributed! with: 0.988"
[1] "density vs fixed.acidity is NOT Normally Distributed! with: 0.989"
[1] "density vs volatile.acidity is NOT Normally Distributed! with: 0.993"
[1] "density vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.988"
[1] "density vs pH is NOT Normally Distributed! with: 0.968"
[1] "density vs sulphates is NOT Normally Distributed! with: 0.992"
[1] "pH vs fixed.acidity is NOT Normally Distributed! with: 0.985"
[1] "pH vs volatile.acidity is NOT Normally Distributed! with: 0.991"
[1] "pH vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.992"
[1] "pH vs density is NOT Normally Distributed! with: 0.968"
[1] "pH vs sulphates is NOT Normally Distributed! with: 0.985"
[1] "sulphates vs fixed.acidity is Normally Distributed! with: 0.996"
[1] "sulphates vs volatile.acidity is NOT Normally Distributed! with: 0.992"
[1] "sulphates vs total.sulfur.dioxide is NOT Normally Distributed! with: 0.988"
[1] "sulphates vs density is NOT Normally Distributed! with: 0.992"
[1] "sulphates vs pH is NOT Normally Distributed! with: 0.985"
df2 <- cbind(df_norm, df_to_scale, data[, 13:14])
df_grate <- df2[df2$label=='great', ]
df_good <- df2[df2$label=='good', ]
df_poor <- df2[df2$label=='poor', ]
df_1 <- df2[df2$y== 1, ]
df_0 <- df2[df2$y ==0, ]
Trying to calculate Box’s M Test manually
n1 <- nrow(df_grate)
n2 <- nrow(df_good)
n3 <- nrow(df_poor)
n4 <- nrow(df_1)
n5 <- nrow(df_0)
m1 <- colMeans(df_grate[,1:11])
m2 <- colMeans(df_good[,1:11])
m3 <- colMeans(df_poor[,1:11])
m4 <- colMeans(df_1[,1:11])
m5 <- colMeans(df_0[,1:11])
s1 <- cov(df_grate[,1:11])
s2 <- cov(df_good[,1:11])
s3 <- cov(df_poor[,1:11])
s4 <- cov(df_1[,1:11])
s5 <- cov(df_0[,1:11])
sp <- ((n1-1)*s1+(n2-1)*s2+(n3-1)*s3)/(n1+n2+n3-3) #Spooled is HERE#
spi <- solve(sp)
spi
fixed.acidity volatile.acidity total.sulfur.dioxide
fixed.acidity 3110.857806 7.3185965 24.2149922
volatile.acidity 7.318596 28.0303581 -1.7038937
total.sulfur.dioxide 24.214992 -1.7038937 3.4452423
density -55207.110776 -1399.9130816 -319.3604423
pH 1638.546160 9.2556597 12.2910877
sulphates 29.824769 4.4528734 -0.3076922
residual.sugar 22.163691 0.2093982 0.1021050
chlorides 278.462380 -30.8393005 4.6354535
free.sulfur.dioxide -1.073089 0.1036913 -0.1898212
alcohol -36.728423 -1.2446720 0.2364624
citric.acid -126.463812 25.3255169 -3.5068357
density pH sulphates residual.sugar
fixed.acidity -55207.11078 1638.5461603 29.82476937 22.16369060
volatile.acidity -1399.91308 9.2556597 4.45287343 0.20939815
total.sulfur.dioxide -319.36044 12.2910877 -0.30769222 0.10210499
density 1671992.60547 -29088.5007914 -1414.61288769 -705.46278416
pH -29088.50079 1673.6982443 7.71877174 12.09372726
sulphates -1414.61289 7.7187717 12.88993094 0.69844840
residual.sugar -705.46278 12.0937273 0.69844840 0.84517595
chlorides -1677.20780 243.3817781 -26.14102361 0.18083640
free.sulfur.dioxide 26.39205 -0.7273122 -0.02061248 -0.02557939
alcohol 1361.28259 -27.9828590 -1.62791964 -0.62976077
citric.acid -983.33987 39.3121655 -0.44452752 -0.30768621
chlorides free.sulfur.dioxide alcohol citric.acid
fixed.acidity 278.4623803 -1.073088817 -36.728423413 -126.4638122
volatile.acidity -30.8393005 0.103691339 -1.244671969 25.3255169
total.sulfur.dioxide 4.6354535 -0.189821166 0.236462372 -3.5068357
density -1677.2077972 26.392049897 1361.282585942 -983.3398690
pH 243.3817781 -0.727312174 -27.982858994 39.3121655
sulphates -26.1410236 -0.020612480 -1.627919637 -0.4445275
residual.sugar 0.1808364 -0.025579391 -0.629760767 -0.3076862
chlorides 622.7774938 -0.168382670 4.727317172 -54.3304842
free.sulfur.dioxide -0.1683827 0.020594188 0.005806883 0.1767275
alcohol 4.7273172 0.005806883 2.435059048 -3.1048161
citric.acid -54.3304842 0.176727524 -3.104816121 80.3449869
sp_2 <- ((n4-1)*s4+(n5-1)*s5)/(n4+n5-2) #Spooled is HERE#
spi_2 <- solve(sp_2)
spi_2
fixed.acidity volatile.acidity total.sulfur.dioxide
fixed.acidity 3108.141661 5.5979305 24.5672927
volatile.acidity 5.597930 27.4099149 -1.5802897
total.sulfur.dioxide 24.567293 -1.5802897 3.4230392
density -55108.160031 -1351.4058851 -329.2106435
pH 1631.987245 6.4562783 12.8469440
sulphates 30.309543 4.6280245 -0.3415658
residual.sugar 22.065924 0.1682393 0.1102374
chlorides 275.882362 -31.8772479 4.8374055
free.sulfur.dioxide -1.075234 0.1032118 -0.1898337
alcohol -36.595349 -1.1877300 0.2253312
citric.acid -127.562063 24.9645491 -3.4353962
density pH sulphates residual.sugar
fixed.acidity -55108.16003 1631.987245 30.30954341 22.06592401
volatile.acidity -1351.40589 6.456278 4.62802455 0.16823933
total.sulfur.dioxide -329.21064 12.846944 -0.34156577 0.11023737
density 1669217.66640 -28889.593985 -1428.84070457 -702.70878282
pH -28889.59399 1662.413129 8.48144409 11.91976107
sulphates -1428.84070 8.481444 12.85144297 0.71004077
residual.sugar -702.70878 11.919761 0.71004077 0.84303333
chlorides -1599.41289 239.055592 -25.88221530 0.11502708
free.sulfur.dioxide 26.45073 -0.730162 -0.02047829 -0.02563066
alcohol 1357.66761 -27.746601 -1.64453352 -0.62641975
citric.acid -954.78733 37.679928 -0.34300162 -0.33226680
chlorides free.sulfur.dioxide alcohol citric.acid
fixed.acidity 275.8823621 -1.075233605 -36.595348932 -127.5620633
volatile.acidity -31.8772479 0.103211799 -1.187730031 24.9645491
total.sulfur.dioxide 4.8374055 -0.189833700 0.225331221 -3.4353962
density -1599.4128909 26.450733628 1357.667611230 -954.7873300
pH 239.0555920 -0.730161982 -27.746601290 37.6799281
sulphates -25.8822153 -0.020478293 -1.644533522 -0.3430016
residual.sugar 0.1150271 -0.025630657 -0.626419754 -0.3322668
chlorides 621.5413949 -0.169357525 4.822436338 -54.9661725
free.sulfur.dioxide -0.1693575 0.020606627 0.005859784 0.1765166
alcohol 4.8224363 0.005859784 2.431362556 -3.0726683
citric.acid -54.9661725 0.176516643 -3.072668332 80.1727517
box_m(df2[,1:11],df2[,"label"])
box_m(df2[,1:11],df2[,"y"])
df2$label <- with(df2, ifelse(label == 'great', 2,
ifelse(label == 'good', 1, 0)))
set.seed(0)
sample <- sample.split(df2, SplitRatio = 0.7)
train <- subset(df2, sample == TRUE)
test <- subset(df2, sample == FALSE)
multi_model <- multinom(label ~ ., data = train[, 1:12])
# weights: 39 (24 variable)
initial value 1216.163804
iter 10 value 619.438991
iter 20 value 437.752258
iter 30 value 435.032107
iter 40 value 434.920188
iter 50 value 434.402810
iter 50 value 434.402806
final value 434.402806
converged
pred_multi <- predict(multi_model, newdata = test[, 1:12], type = "class")
correct_predictions <- sum(pred_multi == test$label)
correct_predictions
[1] 419
xtab <- table(pred_multi, test$label)
cm <- caret::confusionMatrix(xtab)
cm
Confusion Matrix and Statistics
pred_multi 0 1 2
0 0 0 0
1 22 398 47
2 0 4 21
Overall Statistics
Accuracy : 0.8516
95% CI : (0.8171, 0.8819)
No Information Rate : 0.8171
P-Value [Acc > NIR] : 0.02493
Kappa : 0.3176
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0 Class: 1 Class: 2
Sensitivity 0.00000 0.9900 0.30882
Specificity 1.00000 0.2333 0.99057
Pos Pred Value NaN 0.8522 0.84000
Neg Pred Value 0.95528 0.8400 0.89936
Prevalence 0.04472 0.8171 0.13821
Detection Rate 0.00000 0.8089 0.04268
Detection Prevalence 0.00000 0.9492 0.05081
Balanced Accuracy 0.50000 0.6117 0.64969
# Accuracy = TP / TOTAL
print(paste('Accuracy: ', (397 + 21) / 492 ))
[1] "Accuracy: 0.849593495934959"
# Recall = TP / (TP + FN)
Metrics::recall(pred_multi, test$label)
[1] 1.053533
# Precision = TP / (TP + FP)
Metrics::precision(pred_multi, test$label)
Warning: argument is not numeric or logical: returning NA
[1] NA
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
Metrics::f1(pred_multi, test$label)
[1] 0.8
#𝐸(APER)
aer(test$label, pred_multi)
[1] 0.148374
model_3 <- lm(label ~ ., data=train[, 1:12], CV=TRUE)
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument ‘CV’ will be disregarded
summary(model_3)
Call:
lm(formula = label ~ ., data = train[, 1:12], CV = TRUE)
Residuals:
Min 1Q Median 3Q Max
-1.39692 -0.18171 -0.00273 0.11476 1.09962
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.56843323 1.06168373 -0.535 0.59248
fixed.acidity 1.05831354 0.58307781 1.815 0.06979 .
volatile.acidity -0.29533790 0.05562280 -5.310 0.0000001329 ***
total.sulfur.dioxide -0.00776280 0.01988918 -0.390 0.69639
density -34.46398517 13.56120834 -2.541 0.01118 *
pH -0.36774762 0.42949206 -0.856 0.39205
sulphates 0.21313571 0.03770351 5.653 0.0000000201 ***
residual.sugar 0.02182609 0.00998938 2.185 0.02911 *
chlorides -0.69182073 0.25312288 -2.733 0.00637 **
free.sulfur.dioxide -0.00009178 0.00152391 -0.060 0.95199
alcohol 0.07598378 0.01633404 4.652 0.0000036909 ***
citric.acid -0.02654150 0.09526130 -0.279 0.78059
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3531 on 1095 degrees of freedom
Multiple R-squared: 0.2394, Adjusted R-squared: 0.2317
F-statistic: 31.32 on 11 and 1095 DF, p-value: < 0.00000000000000022
plot(model_3, pch=df2$label)
pred_train_3 <- predict(model_3, train[, 1:12], type="response")
pred_test_3 <- predict(model_3, newdata = test[, 1:12], type="response")
train_TAB <- table(train$label, pred_train_3 > 0.95)
train_TAB
FALSE TRUE
0 27 14
1 260 657
2 1 148
test_TAB <- table(test$label, pred_test_3 > 0.95)
test_TAB
FALSE TRUE
0 10 12
1 114 288
2 1 67
model_binorm <- glm(y ~., data = train[, -12], family = binomial)
predictions <- predict(model_binorm, newdata = test[, -12], type = "response")
predicted_classes <- ifelse(predictions > 0.5, 0, 1)
mean(predicted_classes == test$y)
xtab <- table(predicted_classes, test$y)
cm <- caret::confusionMatrix(xtab)
cm
Confusion Matrix and Statistics
predicted_classes 0 1
0 5 22
1 419 46
Accuracy : 0.1037
95% CI : (0.0782, 0.134)
No Information Rate : 0.8618
P-Value [Acc > NIR] : 1
Kappa : -0.0903
Mcnemar's Test P-Value : <0.0000000000000002
Sensitivity : 0.01179
Specificity : 0.67647
Pos Pred Value : 0.18519
Neg Pred Value : 0.09892
Prevalence : 0.86179
Detection Rate : 0.01016
Detection Prevalence : 0.05488
Balanced Accuracy : 0.34413
'Positive' Class : 0
# Accuracy = TP / TOTAL
print(paste('Accuracy: ', (5 + 46) / 492 ))
[1] "Accuracy: 0.103658536585366"
# Recall = TP / (TP + FN)
Metrics::recall(predicted_classes, test$y)
[1] 0.09892473
# Precision = TP / (TP + FP)
Metrics::precision(predicted_classes, test$y)
[1] 0.6764706
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
Metrics::f1(predicted_classes, test$y)
[1] 1
#𝐸(APER)
aer(test$y, predicted_classes)
[1] 0.8963415
#ROC -curve
roc_curve <- roc(ifelse(test$y == 0, 1, 0), ifelse(predicted_classes == 0, 1, 0))
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot ROC curve
plot(roc_curve, main = "ROC Curve", col = "blue")
fit <- vglm(label~., family=multinomial, data=train[, 1:12])
summary(fit)
Call:
vglm(formula = label ~ ., family = multinomial, data = train[,
1:12])
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 3.618396 21.796586 0.166 0.868151
(Intercept):2 16.366926 10.291605 NA NA
fixed.acidity:1 -3.402300 11.974217 NA NA
fixed.acidity:2 -8.647151 5.519332 -1.567 0.117184
volatile.acidity:1 5.828700 1.041344 5.597 0.000000021773 ***
volatile.acidity:2 1.443152 0.575845 2.506 0.012206 *
total.sulfur.dioxide:1 -0.214360 0.416160 -0.515 0.606491
total.sulfur.dioxide:2 0.772187 0.231525 3.335 0.000852 ***
density:1 158.130232 274.549540 0.576 0.564641
density:2 267.396317 131.762985 2.029 0.042420 *
pH:1 12.472557 8.730555 1.429 0.153116
pH:2 0.369114 4.304543 0.086 0.931665
sulphates:1 -2.897919 0.739775 -3.917 0.000089547258 ***
sulphates:2 -2.572639 0.410467 -6.268 0.000000000367 ***
residual.sugar:1 -0.133223 0.195124 -0.683 0.494758
residual.sugar:2 -0.207533 0.089742 -2.313 0.020747 *
chlorides:1 10.217731 4.665817 2.190 0.028531 *
chlorides:2 5.954070 3.462630 1.720 0.085519 .
free.sulfur.dioxide:1 -0.004007 0.032871 -0.122 0.902977
free.sulfur.dioxide:2 -0.022308 0.016551 -1.348 0.177720
alcohol:1 -1.311135 0.346698 -3.782 0.000156 ***
alcohol:2 -0.639605 0.158989 -4.023 0.000057472394 ***
citric.acid:1 2.271508 1.769887 1.283 0.199345
citric.acid:2 -0.254450 1.017662 -0.250 0.802561
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Names of linear predictors: log(mu[,1]/mu[,3]), log(mu[,2]/mu[,3])
Residual deviance: 865.3682 on 2190 degrees of freedom
Log-likelihood: -432.6841 on 2190 degrees of freedom
Number of Fisher scoring iterations: 7
Warning: Hauck-Donner effect detected in the following estimate(s):
'(Intercept):2', 'fixed.acidity:1', 'alcohol:1'
Reference group is level 3 of the response
probabilities <- predict(fit, test[,1:12], type="response")
predictions <- apply(probabilities, 1, which.max)
predictions[which(predictions==0)] <- test$label == 0
predictions[which(predictions==1)] <- test$label == 1
predictions[which(predictions==3)] <- test$label == 2
Warning: number of items to replace is not a multiple of replacement length
# summarize accuracy
xtab_v <- table(predictions, test$label)
cm <- caret::confusionMatrix(xtab_v)
cm
Confusion Matrix and Statistics
predictions 0 1 2
0 0 5 20
1 0 0 1
2 22 397 47
Overall Statistics
Accuracy : 0.0955
95% CI : (0.071, 0.125)
No Information Rate : 0.8171
P-Value [Acc > NIR] : 1
Kappa : -0.0454
Mcnemar's Test P-Value : <0.0000000000000002
Statistics by Class:
Class: 0 Class: 1 Class: 2
Sensitivity 0.00000 0.000000 0.69118
Specificity 0.94681 0.988889 0.01179
Pos Pred Value 0.00000 0.000000 0.10086
Neg Pred Value 0.95289 0.181263 0.19231
Prevalence 0.04472 0.817073 0.13821
Detection Rate 0.00000 0.000000 0.09553
Detection Prevalence 0.05081 0.002033 0.94715
Balanced Accuracy 0.47340 0.494444 0.35148
# Accuracy = TP / TOTAL
print(paste('Accuracy: ', 47 / 492 ))
[1] "Accuracy: 0.0955284552845529"
# Recall = TP / (TP + FN)
Metrics::recall(predictions, test$labeæ)
Warning: argument is not numeric or logical: returning NA
[1] NA
# Precision = TP / (TP + FP)
Metrics::precision(predictions, test$label)
[1] 1.975124
# F1 = 2 * (Precision * Recall) / (Precision + Recall)
Metrics::f1(predictions, test$label)
[1] 1
#𝐸(APER)
aer(test$label, predictions)
[1] 0.9044715
num_data <- df2[, 1:11]
normalized <- scale(num_data)
#After that we create a correlation matrix
corr_matrix <- cor(normalized)
ggcorrplot(corr_matrix)
#Now we make a pca
data.pca <- princomp(corr_matrix)
summary(data.pca)
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
Standard deviation 0.8915682 0.5626588 0.4458347 0.3287614 0.24273993 0.22153175
Proportion of Variance 0.5090581 0.2027442 0.1272934 0.0692181 0.03773467 0.03142897
Cumulative Proportion 0.5090581 0.7118023 0.8390956 0.9083137 0.94604842 0.97747739
Comp.7 Comp.8 Comp.9 Comp.10
Standard deviation 0.15155186 0.089660453 0.054877700 0.0339192702
Proportion of Variance 0.01470892 0.005148255 0.001928635 0.0007368026
Cumulative Proportion 0.99218631 0.997334563 0.999263197 1.0000000000
Comp.11
Standard deviation 0.00000000533722824842028
Proportion of Variance 0.00000000000000001824273
Cumulative Proportion 1.00000000000000000000000
data.pca$loadings[, 1:2]
Comp.1 Comp.2
fixed.acidity 0.50837145 0.03432755
volatile.acidity -0.27437604 0.34818335
total.sulfur.dioxide -0.08619193 0.42217508
density 0.36088060 0.34221286
pH -0.46124636 -0.11623776
sulphates 0.19289017 -0.22326201
residual.sugar 0.07676416 0.13981543
chlorides 0.16932322 0.15362777
free.sulfur.dioxide -0.13509571 0.31081409
alcohol -0.11698193 -0.58865323
citric.acid 0.46060706 -0.18061985
pca_1_2 <-data.pca$loadings[, 1:2]
pca_1_2<-as.matrix(pca_1_2)
numerical_data<-as.matrix(num_data)
#we multiply the numerical_data with our first and second pricipal components
reduced_data<-numerical_data %*% pca_1_2
reduced_data<-as.data.frame(reduced_data)
reduced_data$predicted <- pred_multi <- predict(multi_model,
newdata = df2[, 1:12], type = "class")
reduced_data$true_class <- df2$label
plot1 <- ggplot(reduced_data, aes(x = Comp.1, y = Comp.2, colour = true_class)) +
geom_point()
plot2 <- ggplot(reduced_data, aes(x = Comp.1, y = Comp.2, colour = predicted)) +
geom_point()
plot1
plot2